library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(gganimate)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Read data/movies.csv into `movies`; readr guesses the column types and
# prints the specification it settled on.
movies <- read_csv(file = "data/movies.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## year = col_double(),
## budget = col_double(),
## budget_2013 = col_double(),
## period_code = col_double(),
## decade_code = col_double(),
## response = col_logical(),
## metascore = col_double(),
## imdb_rating = col_double(),
## imdb_votes = col_number(),
## error = col_logical()
## )
## ℹ Use `spec()` for the full column specifications.
# Read data/raw_bechdel.csv into `raw_bechdel`; readr guesses the column types
# and prints the specification it settled on.
raw_bechdel <- read_csv(file = "data/raw_bechdel.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## title = col_character(),
## year = col_double(),
## id = col_double(),
## imdb_id = col_character(),
## rating = col_double()
## )
# Box plot of the IMDB rating for each value of `binary`.
# Rows with a missing rating are dropped by ggplot2 with a warning.
ggplot(data = movies, mapping = aes(x = binary, y = imdb_rating)) +
  geom_boxplot()
## Warning: Removed 202 rows containing non-finite values (stat_boxplot).
# Keep the columns used in the exploration and derive the first listed country
# and genre of every film.
#
# The earlier pattern "[A-z ]+" stopped at the first character that is not an
# ASCII letter or space (and the A-z range also admits the punctuation that
# sits between "Z" and "a"), so hyphenated or accented entries were cut short,
# e.g. "Sci-Fi" came out as "Sci". Taking everything before the first comma
# keeps the whole first entry.
# NOTE(review): assumes `genre` is comma-separated like `country`
# (e.g. "USA, UK") -- confirm against the data.
subset_movies <- movies %>%
  select(
    year, title, binary, budget_2013, plot, rated, language, country, writer,
    metascore, imdb_rating, director, actors, genre, awards, runtime
  ) %>%
  mutate(
    first_country = str_extract(country, pattern = "^[^,]+"),
    first_genre = str_extract(genre, pattern = "^[^,]+")
  )
unique(subset_movies$first_genre)
## [1] NA "Biography" "Action" "Drama" "Comedy"
## [6] "Crime" "Animation" "Horror" "Adventure" "Mystery"
## [11] "Fantasy" "Thriller" "Documentary" "Sci-Fi" "Musical"
## [16] "Family" "Romance" "Western"
What are some interesting variables and potential relationships to explore?
# Preview: attach to every film the number of films sharing its first genre,
# then overwrite `genre` with "Other" when that number is below 100 and with
# the first genre otherwise. The result stays grouped by `first_genre` and is
# only printed, not stored.
subset_movies %>%
  group_by(first_genre) %>%
  mutate(
    count = n(),
    genre = if_else(count < 100, "Other", first_genre)
  )
## # A tibble: 1,794 x 19
## # Groups: first_genre [18]
## year title binary budget_2013 plot rated language country writer metascore
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2013 21 &a… FAIL 13000000 <NA> <NA> <NA> <NA> <NA> NA
## 2 2012 Dredd… PASS 45658735 <NA> <NA> <NA> <NA> <NA> NA
## 3 2013 12 Ye… FAIL 20000000 In t… R English USA, UK "John… 97
## 4 2013 2 Guns FAIL 61000000 A DE… R English… USA "Blak… 55
## 5 2013 42 FAIL 40000000 The … PG-13 English USA "Bria… 62
## 6 2013 47 Ro… FAIL 225000000 A ba… PG-13 English… USA "Chri… 29
## 7 2013 A Goo… FAIL 92000000 John… R English… USA "Skip… 28
## 8 2013 About… PASS 12000000 At t… R English UK "Rich… 55
## 9 2013 Admis… PASS 13000000 A Pr… PG-13 English USA "Kare… 48
## 10 2013 After… FAIL 130000000 A cr… PG-13 English USA "Gary… 33
## # … with 1,784 more rows, and 9 more variables: imdb_rating <dbl>,
## # director <chr>, actors <chr>, genre <chr>, awards <chr>, runtime <chr>,
## # first_country <chr>, first_genre <chr>, count <int>
# Animated scatter plot of Metascore against IMDB rating, one frame per year,
# with point size and colour both driven by the 2013-adjusted budget.
#
# The per-genre `count` / `genre` columns that used to be computed here were
# never mapped to any aesthetic, so that grouped step has been removed.
# `first_genre` stays in the selection only so that drop_na() removes exactly
# the same rows as before.
subset_movies %>%
  select(year, metascore, imdb_rating, budget_2013, first_genre) %>%
  drop_na() %>%
  # Integer years, as in the other animated plots in this file.
  mutate(year = as.integer(year)) %>%
  ggplot(
    aes(
      x = metascore,
      y = imdb_rating,
      size = budget_2013,
      colour = budget_2013
    )
  ) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_size(range = c(2, 12)) +
  # NOTE(review): a log x axis for a bounded score looks unintentional; kept
  # to preserve the existing figure -- confirm.
  scale_x_log10() +
  labs(title = "Year: {frame_time}", x = "Metascore", y = "IMDB rating") +
  transition_time(year) +
  ease_aes("linear")
# Animated bar chart of how many films fall under each value of `binary`,
# one frame per year.
subset_movies %>%
  select(year, binary) %>%
  drop_na() %>%
  # Integer years, as in the other animated plots in this file.
  mutate(year = as.integer(year)) %>%
  ggplot(aes(x = binary, fill = binary)) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_bar(show.legend = FALSE) +
  theme_minimal() +
  scale_fill_manual(values = c("darkblue", "darkred")) +
  labs(
    title = "Year: {frame_time}",
    x = "Bechdel test result",
    y = "Count"
  ) +
  transition_time(year) +
  ease_aes("linear")
# Animated single-bar chart showing, for every year after 1980, the share of
# films with each Bechdel test outcome.
subset_movies %>%
  select(year, binary) %>%
  drop_na() %>%
  mutate(year = as.integer(year)) %>%
  filter(year > 1980) %>%
  # count() returns an ungrouped tally, so no "grouped output" message is
  # emitted as it was with group_by() + summarise().
  count(year, binary, name = "count") %>%
  # A year with only one outcome would otherwise get NA for the other one,
  # which turns both percentages into NA; fill the missing tally with 0.
  pivot_wider(
    names_from = binary,
    values_from = count,
    values_fill = 0L
  ) %>%
  mutate(
    percentage_Pass = round(PASS / (FAIL + PASS), 3),
    percentage_Fail = round(FAIL / (FAIL + PASS), 3)
  ) %>%
  pivot_longer(
    cols = percentage_Pass:percentage_Fail,
    names_to = "test_outcome",
    names_prefix = "percentage_",
    values_to = "percentage"
  ) %>%
  # Fixed heights for the text labels: "Pass" near the bottom of the bar,
  # "Fail" near the top.
  mutate(label_position = if_else(test_outcome == "Pass", 0.1, 0.9)) %>%
  ggplot(aes(x = "", y = percentage, fill = factor(test_outcome))) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(show.legend = FALSE, position = "fill") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 15, face = "bold"),
    plot.title = element_text(size = 30, face = "bold", hjust = -0.2),
    plot.subtitle = element_text(size = 20)
  ) +
  scale_fill_manual(values = c("darkred", "darkblue")) +
  geom_text(
    aes(y = label_position, label = test_outcome, col = test_outcome),
    size = 6,
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Year: {frame_time}\n",
    x = NULL,
    y = "% of Bechdel test outcomes \n",
    subtitle = ""
  ) +
  transition_time(year) +
  ease_aes("linear")